In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [75]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [76]:
test = pd.read_csv("test.csv")
test.head()
Out[76]:
| PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
In [77]:
train = pd.read_csv("train (1).csv")
train.head()
Out[77]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [78]:
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 418 non-null int64 1 Pclass 418 non-null int64 2 Name 418 non-null object 3 Sex 418 non-null object 4 Age 332 non-null float64 5 SibSp 418 non-null int64 6 Parch 418 non-null int64 7 Ticket 418 non-null object 8 Fare 417 non-null float64 9 Cabin 91 non-null object 10 Embarked 418 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 36.1+ KB
In [79]:
test.describe()
Out[79]:
| PassengerId | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
| mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
| std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
| min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
| 50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
| max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
In [80]:
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
In [81]:
train.describe()
Out[81]:
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
In [82]:
df = pd.concat([train,test], ignore_index=True)
In [83]:
df.head()
Out[83]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1.0 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1.0 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0.0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
1. Understanding Dataset¶
Key columns:
| Column | Description |
|---|---|
| PassengerId | Unique ID |
| Survived | 0 = No, 1 = Yes |
| Pclass | Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) |
| Name | Passenger name |
| Sex | Male/Female |
| Age | Age in years |
| SibSp | of siblings/spouses aboard |
| Parch | of parents/children aboard |
| Ticket | Ticket number |
| Fare | Passenger fare |
| Cabin | Cabin number |
| Embarked | Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) |
In [84]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 1309 non-null int64 1 Survived 891 non-null float64 2 Pclass 1309 non-null int64 3 Name 1309 non-null object 4 Sex 1309 non-null object 5 Age 1046 non-null float64 6 SibSp 1309 non-null int64 7 Parch 1309 non-null int64 8 Ticket 1309 non-null object 9 Fare 1308 non-null float64 10 Cabin 295 non-null object 11 Embarked 1307 non-null object dtypes: float64(3), int64(4), object(5) memory usage: 122.8+ KB
In [85]:
df.describe()
Out[85]:
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 1309.000000 | 891.000000 | 1309.000000 | 1046.000000 | 1309.000000 | 1309.000000 | 1308.000000 |
| mean | 655.000000 | 0.383838 | 2.294882 | 29.881138 | 0.498854 | 0.385027 | 33.295479 |
| std | 378.020061 | 0.486592 | 0.837836 | 14.413493 | 1.041658 | 0.865560 | 51.758668 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 328.000000 | 0.000000 | 2.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
| 50% | 655.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 982.000000 | 1.000000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.275000 |
| max | 1309.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 9.000000 | 512.329200 |
2. Feature Engineering¶
- Title
- Age group
- Family Size
- Family Type
- Individual Fare
- Deck
In [86]:
df["Name"]
Out[86]:
| Name | |
|---|---|
| 0 | Braund, Mr. Owen Harris |
| 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... |
| 2 | Heikkinen, Miss. Laina |
| 3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) |
| 4 | Allen, Mr. William Henry |
| ... | ... |
| 1304 | Spector, Mr. Woolf |
| 1305 | Oliva y Ocana, Dona. Fermina |
| 1306 | Saether, Mr. Simon Sivertsen |
| 1307 | Ware, Mr. Frederick |
| 1308 | Peter, Master. Michael J |
1309 rows × 1 columns
In [87]:
df["Title"] = df["Name"].str.split(',').str[1].str.strip( ).str.split(".").str[0]
In [88]:
df['Title'] = np.where((df["Title"] == "Ms") | (df["Title"] == "Mlle"), "Miss", df["Title"])
df['Title'] = np.where(df["Title"] == "Mme", "Mrs", df["Title"])
In [89]:
df["Title"].value_counts()
Out[89]:
| count | |
|---|---|
| Title | |
| Mr | 757 |
| Miss | 264 |
| Mrs | 198 |
| Master | 61 |
| Rev | 8 |
| Dr | 8 |
| Col | 4 |
| Major | 2 |
| Don | 1 |
| Lady | 1 |
| Sir | 1 |
| Capt | 1 |
| the Countess | 1 |
| Jonkheer | 1 |
| Dona | 1 |
In [90]:
df["Family Size"] = df["SibSp"] + df["Parch"] + 1
In [91]:
df["Family Size"].value_counts()
Out[91]:
| count | |
|---|---|
| Family Size | |
| 1 | 790 |
| 2 | 235 |
| 3 | 159 |
| 4 | 43 |
| 6 | 25 |
| 5 | 22 |
| 7 | 16 |
| 11 | 11 |
| 8 | 8 |
In [92]:
df["Family Type"] = np.where(df["Family Size"] == 1, "Solo",
np.where((df["Family Size"] >1) & (df["Family Size"] < 5), "Small",
np.where((df["Family Size"] > 4)& (df["Family Size"] < 7 ),"Medium", "Large")))
In [93]:
df["Family Type"].value_counts()
Out[93]:
| count | |
|---|---|
| Family Type | |
| Solo | 790 |
| Small | 437 |
| Medium | 47 |
| Large | 35 |
In [94]:
df["Individual Fare"] = df["Fare"]/df["Family Size"]
In [95]:
df["Deck"] = df["Cabin"].str[0]
df["Deck"].value_counts()
Out[95]:
| count | |
|---|---|
| Deck | |
| C | 94 |
| B | 65 |
| D | 46 |
| E | 41 |
| A | 22 |
| F | 21 |
| G | 5 |
| T | 1 |
In [96]:
df["Age Group"] = pd.cut(df["Age"], bins=[0,2,17,30,45,100], labels=["Baby", "Child", "Young Adult", "Middle Aged Adult", "Senior"])
df["Age Group"].value_counts()
Out[96]:
| count | |
|---|---|
| Age Group | |
| Young Adult | 455 |
| Middle Aged Adult | 282 |
| Senior | 155 |
| Child | 120 |
| Baby | 34 |
3. Data Analysis¶
- Univariate Analysis
- Bivariate Analysis
- Multivariate Analysis
a. Univariate Analysis:¶
methods for statistics & distribution¶
In [97]:
def stats(df, col):
if pd.api.types.is_numeric_dtype(df[col]):
print(col, "- Numerical Column")
print()
null_count = df[col].isnull().sum().item()
print("\nNull values are: ",null_count)
null_percent = round((null_count/df.shape[0])*100,2)
print("\nNull values percentage is: ",null_percent)
mean = df[col].mean()
print("\nMean is: ",mean)
median = df[col].median()
print("\nMedian is: ",median)
mode = df[col].mode()[0]
print("\nMode is: ",mode)
std = df[col].std()
print("\nStandard Deviation is: ",std)
var = df[col].var()
print("\nVariance is: ",var)
skew = df[col].skew()
print("\nSkewness is: ",skew)
kurt = df[col].kurt()
print("\nKurtosis is: ",kurt)
else:
print(col,"- Categorical Column")
print()
null_count = df[col].isnull().sum().item()
print("\nNull values are: ",df[col].isnull().sum().item())
null_percent = round((null_count/df.shape[0])*100,2)
print("\nNull values percentage is: ",null_percent)
mode = df[col].mode()[0]
print("\nMode is: ",mode)
print("\nValue Counts are: \n")
print(df[col].value_counts())
In [98]:
def num_cat_plots(df, col):
# numerical column
if pd.api.types.is_numeric_dtype(df[col]):
print()
print("Numerical column Analysis : -\n")
print()
stats(df,col)
print()
plt.figure(figsize=(17,5))
sns.set_style("whitegrid")
sns.set_palette("Set2")
plt.subplot(1,3,1)
sns.histplot(x=df[col])
plt.title("Histogram")
plt.subplot(1,3,2)
sns.kdeplot(x=df[col],fill=True)
plt.title("KDE Plot")
plt.subplot(1,3,3)
sns.boxplot(x=df[col])
plt.title("Boxplot")
plt.show()
else:
print()
# Categorical column
print("Categorical column Analysis : -\n")
stats(df,col)
#count plot
plt.figure(figsize=(12,5))
sns.countplot(data=df, x=col)
plt.title(f"Count of different {col}")
plt.show()
# Pie chart
pie_df = df[col].value_counts().reset_index()
pie_df.columns = ["Category", "Count"]
fig = px.pie(pie_df,
names = "Category",
values = "Count" ,
title = (f"Proportion of different {col}"))
fig.update_layout(
title_x = 0.5,
title_font = dict(size = 20 , color = "Red")
)
fig.show()
Analysis of all columns¶
In [99]:
all_columns = ['Age','Fare', 'Individual Fare',"Survived", 'Pclass', 'SibSp', 'Parch','Embarked', 'Family Size', 'Family Type', 'Deck','Sex','Age Group','Title']
all_cat_columns = ["Survived", 'Pclass', 'SibSp', 'Parch','Embarked', 'Family Size', 'Family Type', 'Deck','Sex','Title']
df1 = df.copy()
for i in all_columns :
if i in all_cat_columns:
df1[i] = df[i].astype('category')
df1[i].dtype
num_cat_plots(df1,i)
Numerical column Analysis : - Age - Numerical Column Null values are: 263 Null values percentage is: 20.09 Mean is: 29.881137667304014 Median is: 28.0 Mode is: 24.0 Standard Deviation is: 14.413493211271334 Variance is: 207.74878655136482 Skewness is: 0.40767455974362266 Kurtosis is: 0.1469476357378139
Numerical column Analysis : - Fare - Numerical Column Null values are: 1 Null values percentage is: 0.08 Mean is: 33.29547928134557 Median is: 14.4542 Mode is: 8.05 Standard Deviation is: 51.75866823917414 Variance is: 2678.959737892894 Skewness is: 4.367709134122922 Kurtosis is: 27.027986349442294
Numerical column Analysis : - Individual Fare - Numerical Column Null values are: 1 Null values percentage is: 0.08 Mean is: 20.51821514307558 Median is: 8.512483333333332 Mode is: 13.0 Standard Deviation is: 35.774336893842424 Variance is: 1279.8031801941354 Skewness is: 6.683189172409639 Kurtosis is: 66.46361442187475
Categorical column Analysis : - Survived - Categorical Column Null values are: 418 Null values percentage is: 31.93 Mode is: 0.0 Value Counts are: Survived 0.0 549 1.0 342 Name: count, dtype: int64
Categorical column Analysis : - Pclass - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: 3 Value Counts are: Pclass 3 709 1 323 2 277 Name: count, dtype: int64
Categorical column Analysis : - SibSp - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: 0 Value Counts are: SibSp 0 891 1 319 2 42 4 22 3 20 8 9 5 6 Name: count, dtype: int64
Categorical column Analysis : - Parch - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: 0 Value Counts are: Parch 0 1002 1 170 2 113 3 8 4 6 5 6 6 2 9 2 Name: count, dtype: int64
Categorical column Analysis : - Embarked - Categorical Column Null values are: 2 Null values percentage is: 0.15 Mode is: S Value Counts are: Embarked S 914 C 270 Q 123 Name: count, dtype: int64
Categorical column Analysis : - Family Size - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: 1 Value Counts are: Family Size 1 790 2 235 3 159 4 43 6 25 5 22 7 16 11 11 8 8 Name: count, dtype: int64
Categorical column Analysis : - Family Type - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: Solo Value Counts are: Family Type Solo 790 Small 437 Medium 47 Large 35 Name: count, dtype: int64
Categorical column Analysis : - Deck - Categorical Column Null values are: 1014 Null values percentage is: 77.46 Mode is: C Value Counts are: Deck C 94 B 65 D 46 E 41 A 22 F 21 G 5 T 1 Name: count, dtype: int64
Categorical column Analysis : - Sex - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: male Value Counts are: Sex male 843 female 466 Name: count, dtype: int64
Categorical column Analysis : - Age Group - Categorical Column Null values are: 263 Null values percentage is: 20.09 Mode is: Young Adult Value Counts are: Age Group Young Adult 455 Middle Aged Adult 282 Senior 155 Child 120 Baby 34 Name: count, dtype: int64
Categorical column Analysis : - Title - Categorical Column Null values are: 0 Null values percentage is: 0.0 Mode is: Mr Value Counts are: Title Mr 757 Miss 264 Mrs 198 Master 61 Dr 8 Rev 8 Col 4 Major 2 Lady 1 Dona 1 Don 1 Capt 1 Jonkheer 1 Sir 1 the Countess 1 Name: count, dtype: int64
b. Bivariate Analysis:¶
i. Categorical - Categorical Analysis -¶
Survival - Other categorical column analysis¶
Method for survival rate , survival count & graphs¶
In [168]:
## survival rate
def survival_rate(df, *col):
if not col:
overall_rate = df["Survived"].mean()
return pd.DataFrame({"Overall_Survival_rate": [overall_rate]})
rate = df.groupby(list(col))["Survived"].mean().reset_index().rename(
columns={"Survived": "Survival_rate"}
)
return rate
## survival count
def survival_count(df, *col):
if not col:
overall_counts = df["Survived"].value_counts().reset_index()
overall_counts.columns = ['Survived', 'Count']
return overall_counts
grouping_list = list(col) + ["Survived"]
count = df.groupby(grouping_list).size().reset_index(name="Count")
return count
## Graphs
def cat_survial(df,col):
temp2 = survival_rate(df,col)
sns.barplot(data=temp2, x=col, y="Survival_rate")
plt.title(f"Survival rate across {col}")
plt.xticks(rotation=45)
plt.show()
xx= survival_count(df,col)
xx["Rate"] = xx["Count"]/xx.groupby(col)["Count"].transform("sum")
sns.barplot(data=xx , x = "Survived", y="Rate", hue = col)
plt.title(f"Survival & Non Survival Rate across {col}")
plt.xticks(rotation=45)
plt.legend(title=col, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
temp1 = survival_count(df,col)
fig = px.bar(temp1, x = "Survived", y = "Count", color = col , barmode = "stack",
height=500, width=700, title= f"Survival count of {col}")
fig.update_layout(
title_x = 0.5,
title_font = dict(size = 20 , color = "Green")
)
fig.show()
Analysis on survival rate + survival count of all categorical columns¶
In [101]:
for i in all_cat_columns:
if i != "Survived":
cat_survial(df,i)
P class - Other categorical column analysis¶
Methods for proportion of pclass across each categorical col¶
In [102]:
def pclass_cat_plot(df, col):
# group by Pclass and categorical column
xx = (
df.groupby(["Pclass", col])
.size()
.reset_index(name="Count")
.sort_values(by="Count", ascending=False)
)
# proportion within each Pclass
xx["Proportion %"] = (
xx["Count"] / xx.groupby("Pclass")["Count"].transform("sum")
) * 100
# barplot (proportion per category)
plt.figure(figsize=(8,5))
sns.barplot(data=xx, x=col, y="Proportion %", hue="Pclass")
plt.title(f"Pclass-wise Proportion Distribution for {col}")
plt.xticks(rotation=45)
plt.show()
# Stacked bar chart
pivot = xx.pivot(index="Pclass", columns=col, values="Proportion %").fillna(0)
pivot.plot(kind="bar", stacked=True, figsize=(8,5))
plt.ylabel("Proportion %")
plt.title(f"Stacked Bar – Pclass vs {col}")
plt.xticks(rotation=0)
plt.legend(title=col, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Analysis on proportion of pclass across each categorical col¶
In [103]:
for i in all_cat_columns:
if i != "Pclass":
pclass_cat_plot(df,i)
Sex - Other categorical column analysis¶
In [104]:
temp_df = df.groupby(["Family Type","Sex"]).size().reset_index(name="Count")
fig = px.bar(temp_df, x="Family Type", y='Count' , color = "Sex", title="Count of Male/Female across family types" )
fig.update_layout(
title_x = 0.5,
title_font = dict(size = 18 , color = "Green")
)
In [105]:
temp_df = df.groupby(["Deck","Sex"]).size().reset_index(name="Count")
sns.barplot(data = temp_df, x="Deck", y='Count' , hue = "Sex" )
plt.title("Count of Male/Female in each deck")
plt.show()
In [106]:
temp_df = df.groupby(["Embarked","Sex"]).size().reset_index(name="Count")
sns.barplot(data = temp_df, x="Embarked", y='Count' , hue = "Sex" )
plt.title("Count of Male/Female in each Embarked")
plt.show()
ii. Numerical - Numerical Analysis -¶
In [107]:
def num_num_analysis(df, col1, col2):
# Scatter Plot
plt.figure(figsize=(7,5))
sns.scatterplot(data=df, x=col1, y=col2)
plt.title(f"Scatter Plot: {col1} vs {col2}")
plt.show()
# Joint Plot
sns.jointplot(data=df, x=col1, y=col2, kind="scatter", height=7)
plt.suptitle(f"Joint Plot: {col1} vs {col2}", y=1.02)
plt.show()
# KDE Plot
plt.figure(figsize=(7,5))
sns.kdeplot(data=df, x=col1, y=col2)
plt.title(f"KDE Plot: {col1} vs {col2}")
plt.show()
In [108]:
num_num_analysis(df,'Age','Fare')
In [109]:
num_num_analysis(df,'Age','Individual Fare')
iii. Numerical - categorical Analysis -¶
In [110]:
def num_cat_analysis(df, num_col, cat_col, agg="mean"):
# Boxplot
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x=cat_col, y=num_col)
plt.title(f"Boxplot of {num_col} across {cat_col}")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Violin Plot
plt.figure(figsize=(8,5))
sns.violinplot(data=df, x=cat_col, y=num_col)
plt.title(f"Violin Plot of {num_col} across {cat_col}")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Aggregated Bar Plot: Mean / Median
temp = df.groupby(cat_col)[num_col].agg(agg).reset_index()
temp.rename(columns={num_col: f"{agg}_{num_col}"}, inplace=True)
plt.figure(figsize=(7,4))
sns.barplot(data=temp, x=cat_col, y=f"{agg}_{num_col}", palette="viridis")
plt.title(f"{agg.title()} {num_col} by {cat_col}")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [111]:
for i in ['Survived',"Pclass","Sex","Age Group","Embarked","Family Type"]:
print("Fare vs other categorical columns")
num_cat_analysis(df, 'Fare', i, agg="mean")
print("Age vs other categorical columns")
num_cat_analysis(df, 'Age', i, agg="mean")
print("Individual Fare vs other categorical columns")
num_cat_analysis(df, 'Individual Fare', i, agg="mean")
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
Fare vs other categorical columns
Age vs other categorical columns
Individual Fare vs other categorical columns
c. Multivariate Analysis:¶
Pclass + Sex + Survived¶
In [112]:
temp = df.groupby(["Pclass","Sex","Survived"]).size().reset_index(name="Count")
#Clustered barchart
plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Pclass", y="Survived", hue="Sex")
plt.title("Survival Rate by Pclass and Sex")
plt.show()
# sunburst
px.sunburst(temp , path = ["Pclass", "Sex","Survived"], color = "Count",
height = 500, width = 500, title = "Proportion of survivals across Pclass , Sex")
Age + Sex + Survived¶
In [113]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Sex", y="Age", hue="Survived")
plt.title("Age Distribution by Sex and Survival")
plt.show()
g = sns.FacetGrid(df, col="Sex", hue="Survived", height=4)
g.map(sns.kdeplot, "Age", fill=True, alpha=0.6)
g.add_legend()
plt.show()
Fare + Pclass + Survived¶
In [114]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Fare", y="Pclass", hue="Survived", alpha=0.7)
plt.title("Fare vs Pclass by Survival")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Pclass", y="Fare", hue="Survived")
plt.title("Fare Distribution by Pclass and Survival")
plt.show()
Embarked + Pclass + Survived¶
In [115]:
temp = df.groupby(["Embarked","Pclass","Survived"]).size().reset_index(name="Count")
fig = px.sunburst(temp , path = ["Embarked","Pclass","Count"],color = "Count",
height = 500, width = 500, title = "Count of passengers across different pclass , embark")
fig.show()
plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Embarked", y="Survived", hue="Pclass")
plt.title("Survival Rate by Embarked Port and Pclass")
plt.show()
FamilySize + Pclass + Survived¶
In [116]:
plt.figure(figsize=(8,5))
sns.barplot(data=df, x="Family Size", y="Survived", hue="Pclass")
plt.title("Survival Rate by Family Size and Pclass")
plt.show()
Age + Fare + Survived¶
In [117]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Age", y="Fare", hue="Survived", alpha=0.7)
plt.title("Age vs Fare by Survival")
plt.show()
sns.jointplot(data=df, x="Age", y="Fare", hue="Survived", kind="kde")
plt.suptitle("KDE Joint Distribution – Age & Fare", y=1.02)
plt.show()
Age + Pclass + Survived¶
In [118]:
def age_pclass_survived_plots(df):
plt.figure(figsize=(14,5))
# Boxplot: Age vs Pclass split by Survived
sns.boxplot(data=df, x="Pclass", y="Age", hue="Survived", palette="Set2")
plt.title("Age Distribution Across Pclass (Survived vs Not)")
plt.show()
# Swarm Plot
plt.figure(figsize=(14,5))
sns.swarmplot(data=df, x="Pclass", y="Age", hue="Survived", palette="cool", dodge=True)
plt.title("Swarm Plot: Age vs Pclass by Survival")
plt.show()
# Histogram Plot
plt.figure(figsize=(14,5))
sns.histplot(data=df, x="Age", hue="Pclass", multiple="stack", kde=True)
plt.title("Stacked Age Histogram by Pclass")
plt.show()
age_pclass_survived_plots(df)
Survived + Deck + Pclass¶
In [119]:
def survived_deck_pclass_plots(df):
temp = df.dropna(subset=["Deck"])
# grouped barchart
xx = temp.groupby(["Deck", "Pclass", "Survived"]).size().reset_index(name="Count")
xx["Proportion Percentage"] = xx["Count"] / xx.groupby(["Deck", "Pclass"])["Count"].transform("sum")
plt.figure(figsize=(12,6))
sns.barplot(data=xx, x="Deck", y="Proportion Percentage", hue="Survived", palette="Set2")
plt.title("Proportion Percentage of Survival by Deck")
plt.show()
# heatmap
heat = temp.groupby(["Deck", "Pclass"])["Survived"].mean().unstack()
plt.figure(figsize=(10,6))
sns.heatmap(heat, annot=True, cmap="YlGnBu")
plt.title("Survival Rate Heatmap (Deck vs Pclass)")
plt.show()
survived_deck_pclass_plots(df)
4. Handling Null Values :¶
Methods for null value filling¶
In [120]:
## Checking fill values
def fill_methods(df, col):
s = df[col]
filled = {}
if pd.api.types.is_numeric_dtype(df[col]): # NUMERIC
mean_val = s.mean()
median_val = s.median()
mode_val = s.mode()[0]
non_null = s.dropna()
rand_one = np.random.choice(non_null) # 1 random value from col
rand_many = np.random.choice(non_null, size=s.isna().sum()) # multiple random values from col
# null values filled with different methods
filled["mean"] = s.fillna(mean_val)
filled["median"] = s.fillna(median_val)
filled["mode"] = s.fillna(mode_val)
filled["rand1"] = s.fillna(rand_one)
filled["zero"] = s.fillna(0) # filling with arbitrary value 0
r = s.copy()
r[r.isna()] = rand_many # filling with multiplr random values
filled["rand-many"] = r
else: # CATEGORICAL
mode_val = s.mode()[0]
non_null = s.dropna()
# Create a temporary series to add new categories for imputation
temp_s = s.copy()
if "None" not in temp_s.cat.categories:
temp_s = temp_s.cat.add_categories("None")
if "Unknown" not in temp_s.cat.categories:
temp_s = temp_s.cat.add_categories("Unknown")
filled["mode"] = temp_s.fillna(mode_val)
filled["none"] = temp_s.fillna("None")
filled["unknown"] = temp_s.fillna("Unknown")
filled["rand-cat"] = temp_s.fillna(np.random.choice(non_null))
temp = pd.DataFrame(filled)
return filled, temp
# Plotting imputation methods
def plots(original, filled_dict, colname):
is_numeric = pd.api.types.is_numeric_dtype(original)
for method, series in filled_dict.items():
# KDE Plot (only for numeric columns)
if is_numeric:
plt.figure(figsize=(10,5))
plt.title(f"KDE Plot Comparison for {colname} — {method}")
original.dropna().plot(kind='kde', label='Original')
series.plot(kind='kde', label=method)
plt.legend()
plt.show()
# Histogram (works for both numeric and categorical)
plt.figure(figsize=(10,5))
plt.title(f"Histogram Comparison for {colname} — {method}")
plt.hist(original.dropna(), alpha=0.5, label="Original")
plt.hist(series, alpha=0.5, label=method)
plt.legend()
plt.show()
# Boxplot (only for numeric columns)
if is_numeric:
plt.figure(figsize=(6,4))
plt.title(f"Boxplot Comparison for {colname} — {method}")
plt.boxplot([original.dropna(), series], labels=["Original", method])
plt.show()
a. Age:-¶
In [121]:
## finding best imputation method
fill_dict , temp_df = fill_methods(df, "Age")
temp_df["Original"] = df["Age"]
temp_df
Out[121]:
| mean | median | mode | rand1 | zero | rand-many | Original | |
|---|---|---|---|---|---|---|---|
| 0 | 22.000000 | 22.0 | 22.0 | 22.0 | 22.0 | 22.0 | 22.0 |
| 1 | 38.000000 | 38.0 | 38.0 | 38.0 | 38.0 | 38.0 | 38.0 |
| 2 | 26.000000 | 26.0 | 26.0 | 26.0 | 26.0 | 26.0 | 26.0 |
| 3 | 35.000000 | 35.0 | 35.0 | 35.0 | 35.0 | 35.0 | 35.0 |
| 4 | 35.000000 | 35.0 | 35.0 | 35.0 | 35.0 | 35.0 | 35.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1304 | 29.881138 | 28.0 | 24.0 | 34.0 | 0.0 | 36.0 | NaN |
| 1305 | 39.000000 | 39.0 | 39.0 | 39.0 | 39.0 | 39.0 | 39.0 |
| 1306 | 38.500000 | 38.5 | 38.5 | 38.5 | 38.5 | 38.5 | 38.5 |
| 1307 | 29.881138 | 28.0 | 24.0 | 34.0 | 0.0 | 24.0 | NaN |
| 1308 | 29.881138 | 28.0 | 24.0 | 34.0 | 0.0 | 33.0 | NaN |
1309 rows × 7 columns
In [122]:
## plotting graph for each imputation method
plots(df['Age'], fill_dict , 'Age')
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
/tmp/ipython-input-1043010543.py:76: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
In [123]:
# before replacing null values
print("Null values (before): ", df["Age"].isnull().sum())
# After replacing null values
df["Age"]=temp_df["rand-many"]
print("Null values (after): ", df["Age"].isnull().sum())
Null values (before): 263 Null values (after): 0
Age Distribution after filling null values :¶
In [146]:
sns.kdeplot(x = df["Age"])
plt.title("Age distribution after replacing null values")
plt.show()
b. Fare , Individual Fare¶
In [125]:
stats(df, "Fare")
Fare - Numerical Column Null values are: 1 Null values percentage is: 0.08 Mean is: 33.29547928134557 Median is: 14.4542 Mode is: 8.05 Standard Deviation is: 51.75866823917414 Variance is: 2678.959737892894 Skewness is: 4.367709134122922 Kurtosis is: 27.027986349442294
In [126]:
stats(df , "Individual Fare")
Individual Fare - Numerical Column Null values are: 1 Null values percentage is: 0.08 Mean is: 20.51821514307558 Median is: 8.512483333333332 Mode is: 13.0 Standard Deviation is: 35.774336893842424 Variance is: 1279.8031801941354 Skewness is: 6.683189172409639 Kurtosis is: 66.46361442187475
In [127]:
df[df["Fare"].isnull()]
Out[127]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title | Family Size | Family Type | Individual Fare | Deck | Age Group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1043 | 1044 | NaN | 3 | Storey, Mr. Thomas | male | 60.5 | 0 | 0 | 3701 | NaN | NaN | S | Mr | 1 | Solo | NaN | NaN | Senior |
In [128]:
df[df["Individual Fare"].isnull()]
Out[128]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title | Family Size | Family Type | Individual Fare | Deck | Age Group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1043 | 1044 | NaN | 3 | Storey, Mr. Thomas | male | 60.5 | 0 | 0 | 3701 | NaN | NaN | S | Mr | 1 | Solo | NaN | NaN | Senior |
Replacing null value :¶
In [147]:
# before replacing null values
print("Null values in fare (before): ", df["Fare"].isnull().sum())
print("Null values in individual (before): ", df["Individual Fare"].isnull().sum())
# After replacing null values
df["Fare"]= df["Fare"].fillna(df["Fare"].median())
print("Null values in fare (after): ", df["Fare"].isnull().sum())
df["Individual Fare"]= df["Individual Fare"].fillna(df["Individual Fare"].median())
print("Null values in Individual Fare (after): ", df["Individual Fare"].isnull().sum())
Null values in fare (before): 1 Null values in individual (before): 1 Null values in fare (after): 0 Null values in Individual Fare (after): 0
Fare, Individual Fare Distribution after filling null values:¶
In [148]:
sns.kdeplot(data = df,x = df["Fare"], label = "Fare")
plt.title("Fare distribution after replacing null values")
sns.kdeplot(data = df,x = df["Individual Fare"], label = "Individual Fare")
plt.title("Individual Fare distribution after replacing null values")
plt.legend()
plt.show()
c. Age Group¶
Handling Null Values:¶
In [149]:
fill_dict , temp_df = fill_methods(df, "Age Group")
temp_df["Original"] = df["Age Group"]
temp_df
Out[149]:
| mode | none | unknown | rand-cat | Original | |
|---|---|---|---|---|---|
| 0 | Young Adult | Young Adult | Young Adult | Young Adult | Young Adult |
| 1 | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult |
| 2 | Young Adult | Young Adult | Young Adult | Young Adult | Young Adult |
| 3 | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult |
| 4 | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult |
| ... | ... | ... | ... | ... | ... |
| 1304 | Young Adult | None | Unknown | Middle Aged Adult | NaN |
| 1305 | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult |
| 1306 | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult | Middle Aged Adult |
| 1307 | Young Adult | None | Unknown | Middle Aged Adult | NaN |
| 1308 | Young Adult | None | Unknown | Middle Aged Adult | NaN |
1309 rows × 5 columns
In [150]:
plots(df['Age Group'], fill_dict , 'Age Group')
Replacing Null Values:¶
In [151]:
# before replacing null values
print("Null values (before): ", df["Age Group"].isnull().sum())
# After replacing null values
df["Age Group"]=temp_df["mode"]
print("Null values (after): ", df["Age Group"].isnull().sum())
Null values (before): 263 Null values (after): 0
Age group Distribution after filling null values:¶
In [152]:
df["Age Group"].value_counts().plot(kind = "bar")
plt.title("Passenger count across different age groups")
plt.show()
In [153]:
df["Age Group"].value_counts().plot(kind = "pie", autopct = "%0.1f%%")
plt.title("Passenger Proportion across different age groups")
plt.show()
d. Deck¶
Handling Null values :¶
In [154]:
df["Deck"] = df["Deck"].astype('category')
fill_dict , temp_df = fill_methods(df, "Deck")
temp_df["Original"] = df["Deck"]
temp_df
Out[154]:
| mode | none | unknown | rand-cat | Original | |
|---|---|---|---|---|---|
| 0 | C | None | Unknown | C | NaN |
| 1 | C | C | C | C | C |
| 2 | C | None | Unknown | C | NaN |
| 3 | C | C | C | C | C |
| 4 | C | None | Unknown | C | NaN |
| ... | ... | ... | ... | ... | ... |
| 1304 | C | None | Unknown | C | NaN |
| 1305 | C | C | C | C | C |
| 1306 | C | None | Unknown | C | NaN |
| 1307 | C | None | Unknown | C | NaN |
| 1308 | C | None | Unknown | C | NaN |
1309 rows × 5 columns
In [137]:
plots(df["Deck"],fill_dict,"Deck")
Replacing Null Values:¶
In [155]:
# before replacing null values
print("Null values (before): ", df["Deck"].isnull().sum())
# After replacing null values
df["Deck"]=temp_df["unknown"]
print("Null values (after): ", df["Deck"].isnull().sum())
Null values (before): 1014 Null values (after): 0
Deck Distribution after filling null values:¶
In [156]:
df["Deck"].value_counts().plot(kind = "bar")
plt.title("Passenger count across different Decks")
plt.show()
In [157]:
df["Deck"].value_counts().plot(kind = "pie" , autopct = "%0.1f%%" )
plt.title("Passenger proportion across different Decks")
plt.show()
5. Handling Outliers :¶
Detect Outliers in Fare (IQR Method)¶
In [141]:
def outlier_detection_iqr(df, col):
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
outliers = df[(df[col] < lower) | (df[col] > upper)]
print(f"Total Fare Outliers Found: {len(outliers)}")
print(f"Lower Bound: {lower:.2f}, Upper Bound: {upper:.2f}")
return outliers
outlier_detection_iqr(df, "Fare")
Total Fare Outliers Found: 171 Lower Bound: -27.17, Upper Bound: 66.34
Out[141]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title | Family Size | Family Type | Individual Fare | Deck | Age Group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | Mrs | 2 | Small | 35.641650 | C | Middle Aged Adult |
| 27 | 28 | 0.0 | 1 | Fortune, Mr. Charles Alexander | male | 19.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S | Mr | 6 | Medium | 43.833333 | C | Young Adult |
| 31 | 32 | 1.0 | 1 | Spencer, Mrs. William Augustus (Marie Eugenie) | female | 32.0 | 1 | 0 | PC 17569 | 146.5208 | B78 | C | Mrs | 2 | Small | 73.260400 | B | NaN |
| 34 | 35 | 0.0 | 1 | Meyer, Mr. Edgar Joseph | male | 28.0 | 1 | 0 | PC 17604 | 82.1708 | NaN | C | Mr | 2 | Small | 41.085400 | NaN | Young Adult |
| 52 | 53 | 1.0 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C | Mrs | 2 | Small | 38.364600 | D | Senior |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1288 | 1289 | NaN | 1 | Frolicher-Stehli, Mrs. Maxmillian (Margaretha ... | female | 48.0 | 1 | 1 | 13567 | 79.2000 | B41 | C | Mrs | 3 | Small | 26.400000 | B | Senior |
| 1291 | 1292 | NaN | 1 | Bonnell, Miss. Caroline | female | 30.0 | 0 | 0 | 36928 | 164.8667 | C7 | S | Miss | 1 | Solo | 164.866700 | C | Young Adult |
| 1298 | 1299 | NaN | 1 | Widener, Mr. George Dunton | male | 50.0 | 1 | 1 | 113503 | 211.5000 | C80 | C | Mr | 3 | Small | 70.500000 | C | Senior |
| 1302 | 1303 | NaN | 1 | Minahan, Mrs. William Edward (Lillian E Thorpe) | female | 37.0 | 1 | 0 | 19928 | 90.0000 | C78 | Q | Mrs | 2 | Small | 45.000000 | C | Middle Aged Adult |
| 1305 | 1306 | NaN | 1 | Oliva y Ocana, Dona. Fermina | female | 39.0 | 0 | 0 | PC 17758 | 108.9000 | C105 | C | Dona | 1 | Solo | 108.900000 | C | Middle Aged Adult |
171 rows × 18 columns
In [142]:
def outlier_plot(df,col):
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
sns.boxplot(data = df, x=df[col])
plt.title(f"Boxplot: {col} with Outliers")
plt.subplot(1,2,2)
sns.histplot(df[col], kde=True)
plt.title(f"Distribution of {col}")
plt.show()
outlier_plot(df,'Fare')
In [158]:
def capping_outliers(df,col):
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
df[col] = df[col].clip(lower, upper)
print(f"{col}Outliers Treated Using Capping.")
return df
capping_outliers(df,"Fare")
FareOutliers Treated Using Capping.
Out[158]:
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title | Family Size | Family Type | Individual Fare | Deck | Age Group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | Mr | 2 | Small | 3.625000 | Unknown | Young Adult |
| 1 | 2 | 1.0 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 66.3438 | C85 | C | Mrs | 2 | Small | 35.641650 | C | Middle Aged Adult |
| 2 | 3 | 1.0 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | Miss | 1 | Solo | 7.925000 | Unknown | Young Adult |
| 3 | 4 | 1.0 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | Mrs | 2 | Small | 26.550000 | C | Middle Aged Adult |
| 4 | 5 | 0.0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | Mr | 1 | Solo | 8.050000 | Unknown | Middle Aged Adult |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1304 | 1305 | NaN | 3 | Spector, Mr. Woolf | male | 36.0 | 0 | 0 | A.5. 3236 | 8.0500 | NaN | S | Mr | 1 | Solo | 8.050000 | Unknown | Young Adult |
| 1305 | 1306 | NaN | 1 | Oliva y Ocana, Dona. Fermina | female | 39.0 | 0 | 0 | PC 17758 | 66.3438 | C105 | C | Dona | 1 | Solo | 108.900000 | C | Middle Aged Adult |
| 1306 | 1307 | NaN | 3 | Saether, Mr. Simon Sivertsen | male | 38.5 | 0 | 0 | SOTON/O.Q. 3101262 | 7.2500 | NaN | S | Mr | 1 | Solo | 7.250000 | Unknown | Middle Aged Adult |
| 1307 | 1308 | NaN | 3 | Ware, Mr. Frederick | male | 24.0 | 0 | 0 | 359309 | 8.0500 | NaN | S | Mr | 1 | Solo | 8.050000 | Unknown | Young Adult |
| 1308 | 1309 | NaN | 3 | Peter, Master. Michael J | male | 33.0 | 1 | 1 | 2668 | 22.3583 | NaN | C | Master | 3 | Small | 7.452767 | Unknown | Young Adult |
1309 rows × 18 columns
In [159]:
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
sns.boxplot(data = df , x=df['Fare'])
plt.title(f"Boxplot: Fare without Outliers")
plt.subplot(1,2,2)
sns.histplot(df['Fare'], kde=True)
plt.title(f"Distribution of Fare")
plt.show()
Univariate Analysis after removing outliers:¶
In [160]:
num_cat_analysis(df, "Fare", "Survived", agg="mean")
6.Analysis After cleaning Data:¶
a. Bivariate Analysis¶
i. Age:¶
In [163]:
for i in all_cat_columns:
num_cat_analysis(df, "Age", i, agg="mean")
In [165]:
num_num_analysis(df, "Age", "Fare")
ii. Fare:¶
In [164]:
for i in all_cat_columns:
num_cat_analysis(df, "Fare", i, agg="mean")
In [166]:
num_num_analysis(df, "Fare","Age")
iii. Individual Fare:¶
In [174]:
for i in all_cat_columns:
num_cat_analysis(df, "Individual Fare", i, agg="mean")
In [175]:
num_num_analysis(df, "Individual Fare","Age")
iv. Age group :¶
In [170]:
cat_survial(df,"Age Group")
In [171]:
pclass_cat_plot(df,"Age Group")
In [173]:
temp_df = df.groupby(["Age Group","Sex"]).size().reset_index(name="Count")
fig = px.bar(temp_df, x="Age Group", y='Count' , color = "Sex", title="Count of Male/Female across Age Group" )
fig.update_layout(
title_x = 0.5,
title_font = dict(size = 18 , color = "Green"))
b. Multivariate Analysis¶
Age + Sex + Survived :¶
In [176]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Sex", y="Age", hue="Survived")
plt.title("Age Distribution by Sex and Survival")
plt.show()
g = sns.FacetGrid(df, col="Sex", hue="Survived", height=4)
g.map(sns.kdeplot, "Age", fill=True, alpha=0.6)
g.add_legend()
plt.show()
Fare + Pclass + Survived:¶
In [177]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Fare", y="Pclass", hue="Survived", alpha=0.7)
plt.title("Fare vs Pclass by Survival")
plt.show()
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Pclass", y="Fare", hue="Survived")
plt.title("Fare Distribution by Pclass and Survival")
plt.show()
Age + Fare + Survived :¶
In [178]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Age", y="Fare", hue="Survived", alpha=0.7)
plt.title("Age vs Fare by Survival")
plt.show()
sns.jointplot(data=df, x="Age", y="Fare", hue="Survived", kind="kde")
plt.suptitle("KDE Joint Distribution – Age & Fare", y=1.02)
plt.show()
Age + Pclass + Survived:¶
In [179]:
age_pclass_survived_plots(df)
survived + deck + pclass:¶
In [180]:
survived_deck_pclass_plots(df)
In [ ]: